{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Document vectors\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from gensim import utils\n",
    "from gensim.models.doc2vec import TaggedDocument\n",
    "from gensim.models import Doc2Vec\n",
    "from gensim.parsing.preprocessing import preprocess_string,remove_stopwords\n",
    "import random\n",
    "import warnings\n",
    "from scipy.spatial.distance import pdist\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "import scipy.io\n",
    "from collections import Counter\n",
    "from itertools import product\n",
    "from tqdm import trange\n",
    "from tqdm import tqdm\n",
    "from copy import deepcopy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize(word_vec):\n",
    "    \"\"\"Scale ``word_vec`` to unit L2 norm along axis 0.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    word_vec : array_like\n",
    "        A 1-D vector, or an N-D array whose axis-0 slices are normalised\n",
    "        independently (for 2-D input: each column).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray or the input object\n",
    "        The input divided by its L2 norm. Zero-norm vectors are left\n",
    "        unchanged rather than divided by zero; a 1-D zero vector is\n",
    "        returned as the very object that was passed in.\n",
    "    \"\"\"\n",
    "    norm = np.linalg.norm(word_vec, ord=2, axis=0)\n",
    "    if np.ndim(norm) == 0:\n",
    "        # 1-D input: the norm is a scalar, so a plain comparison is safe.\n",
    "        if norm == 0:\n",
    "            return word_vec\n",
    "        return word_vec / norm\n",
    "    # N-D input: `norm` is an array, and `if norm == 0` would raise a\n",
    "    # ValueError (ambiguous truth value). Divide zero-norm slices by 1 so\n",
    "    # they pass through unchanged.\n",
    "    return word_vec / np.where(norm == 0, 1, norm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                          peco  \\\n",
      "0     sodium chloride exposure   \n",
      "1             unknown exposure   \n",
      "2  Pseudomonas avenae exposure   \n",
      "3             greenhouse study   \n",
      "4       abscisic acid exposure   \n",
      "\n",
      "                                                 def  \n",
      "0  A salt exposure (PECO:0007185) involving the u...  \n",
      "1  A plant exposure (PECO:0001001) where there is...  \n",
      "2  The treatment involving exposure of plant to t...  \n",
      "3  The treatment where the plants were grown unde...  \n",
      "4  A growth hormone exposure (PECO:0007165) invol...  \n",
      "<class 'pandas.core.frame.DataFrame'>\n"
     ]
    }
   ],
   "source": [
    "# Load the PECO table and show a quick preview plus its Python type.\n",
    "peco_path = r'..\\data\\peco_def.xlsx'\n",
    "lines = pd.read_excel(peco_path)\n",
    "for preview in (lines.head(), type(lines)):\n",
    "    print(preview)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DocumentDataset(object):\n",
    "    \"\"\"Tagged documents built from one text column of a DataFrame.\n",
    "\n",
    "    Every value of ``data[column]`` is passed through ``preprocess`` and\n",
    "    wrapped in a ``TaggedDocument`` whose single tag is the row's index label.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, data: pd.DataFrame, column):\n",
    "        \"\"\"Preprocess ``data[column]`` and store the tagged documents.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        data : pandas.DataFrame\n",
    "            Frame holding the raw text.\n",
    "        column : hashable\n",
    "            Label of the text column in ``data``.\n",
    "        \"\"\"\n",
    "        processed = data[column].apply(self.preprocess)\n",
    "        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.\n",
    "        self.documents = [TaggedDocument(text, [index])\n",
    "                          for index, text in processed.items()]\n",
    "\n",
    "    def preprocess(self, document):\n",
    "        \"\"\"Apply gensim's ``remove_stopwords`` then ``preprocess_string``.\"\"\"\n",
    "        return preprocess_string(remove_stopwords(document))\n",
    "\n",
    "    def __iter__(self):\n",
    "        \"\"\"Yield the stored tagged documents one at a time.\"\"\"\n",
    "        for document in self.documents:\n",
    "            yield document\n",
    "\n",
    "    def tagged_documents(self, shuffle=False):\n",
    "        \"\"\"Return the list of tagged documents.\n",
    "\n",
    "        When ``shuffle`` is true, ``self.documents`` is shuffled in place\n",
    "        first, so the new order also persists for later calls.\n",
    "        \"\"\"\n",
    "        if shuffle:\n",
    "            random.shuffle(self.documents)\n",
    "        return self.documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag and tokenise the 'def' column of the currently loaded table.\n",
    "document_dataset = DocumentDataset(data=lines, column='def')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Doc2Vec hyper-parameters, gathered in one place.\n",
    "doc2vec_params = dict(\n",
    "    min_count=1,\n",
    "    window=5,\n",
    "    vector_size=100,\n",
    "    sample=1e-4,\n",
    "    negative=5,\n",
    "    workers=2,\n",
    ")\n",
    "docVecModel = Doc2Vec(**doc2vec_params)\n",
    "\n",
    "# Build the vocabulary from the tagged documents before any training.\n",
    "docVecModel.build_vocab(document_dataset.tagged_documents())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the corpus, then train for a fixed 10 epochs.\n",
    "shuffled_docs = document_dataset.tagged_documents(shuffle=True)\n",
    "docVecModel.train(\n",
    "    shuffled_docs,\n",
    "    total_examples=docVecModel.corpus_count,\n",
    "    epochs=10,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the trained model next to the notebook.\n",
    "docVecModel.save('pecoVecModel.d2v')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Doc-vector table: `dv` in gensim 4.x, `docvecs` in gensim 3.x.\n",
    "doc_vectors = docVecModel.dv if hasattr(docVecModel, 'dv') else docVecModel.docvecs\n",
    "\n",
    "# Look vectors up by document tag explicitly instead of `docVecModel[i]`.\n",
    "# NOTE(review): in gensim 4.x an integer key on the model appears to resolve\n",
    "# to a word vector when it is a valid word index -- confirm for the installed version.\n",
    "pl = [doc_vectors[i] for i in range(len(lines['peco']))]\n",
    "\n",
    "# Pairwise Pearson correlation between document vectors (one row per document).\n",
    "# Passing `pl` a second time as `y` would stack the data with itself and give a\n",
    "# 2N x 2N matrix made of four identical N x N blocks.\n",
    "PSSM = np.corrcoef(pl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.        , -0.01469787, -0.05122649, ...,  0.04645632,\n",
       "         0.15341735, -0.1502182 ],\n",
       "       [-0.01469787,  1.        , -0.02405414, ..., -0.09853404,\n",
       "        -0.01296746,  0.02360266],\n",
       "       [-0.05122649, -0.02405414,  1.        , ...,  0.17213865,\n",
       "        -0.00572574, -0.11191597],\n",
       "       ...,\n",
       "       [ 0.04645632, -0.09853404,  0.17213865, ...,  1.        ,\n",
       "        -0.00693678,  0.07415919],\n",
       "       [ 0.15341735, -0.01296746, -0.00572574, ..., -0.00693678,\n",
       "         1.        , -0.09784653],\n",
       "       [-0.1502182 ,  0.02360266, -0.11191597, ...,  0.07415919,\n",
       "        -0.09784653,  1.        ]])"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the matrix; NumPy elides the middle of large arrays in the repr.\n",
    "PSSM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the matrix as comma-separated text with '%f' formatting.\n",
    "np.savetxt(fname='PSSM.txt', X=PSSM, fmt='%f', delimiter=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1.         0.98198051]\n",
      " [0.98198051 1.        ]]\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                       gene  \\\n",
      "0                            LOC_Os01g64660   \n",
      "1                            LOC_Os03g38000   \n",
      "2                            LOC_Os10g20630   \n",
      "3  BTH-induced ERF transcriptional factor 2   \n",
      "4                    Ent-kaurene synthase 6   \n",
      "\n",
      "                                                 def  \n",
      "0  Catalysis of the reaction: D-fructose 1,6-bisp...  \n",
      "1  Binding to an RNA molecule or a portion thereo...  \n",
      "2                   Binding to a calcium ion (Ca2+).  \n",
      "3  A transcription regulator activity that modula...  \n",
      "4  Reactions, triggered in response to the presen...  \n",
      "12187\n"
     ]
    }
   ],
   "source": [
    "# Load the gene table (columns `gene` and `def`). This rebinds `lines`,\n",
    "# replacing the previously loaded table.\n",
    "# The path now mirrors the relative `..\\data\\` location used for the PECO\n",
    "# workbook; the former literal began with a stray `D..` and a doubled separator.\n",
    "lines = pd.read_excel(r'..\\data\\gene_name_def.xlsx')\n",
    "print(lines.head())\n",
    "print(len(lines))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag and tokenise the 'def' column of the currently loaded table.\n",
    "document_dataset = DocumentDataset(data=lines, column='def')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from a fresh model for this corpus: a model whose vocabulary and\n",
    "# document tags were built on a different dataset cannot be trained on these\n",
    "# documents, and its `corpus_count` would not match them either.\n",
    "docVecModel = Doc2Vec(min_count=1,\n",
    "                    window=5,\n",
    "                    vector_size=100,sample=1e-4,\n",
    "                    negative=5,\n",
    "                    workers=2)\n",
    "docVecModel.build_vocab(document_dataset.tagged_documents())\n",
    "\n",
    "docVecModel.train(document_dataset.tagged_documents(shuffle=True),\n",
    "            total_examples = docVecModel.corpus_count,\n",
    "            epochs=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the trained model next to the notebook.\n",
    "docVecModel.save('geneVecModel.d2v')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Doc-vector table: `dv` in gensim 4.x, `docvecs` in gensim 3.x.\n",
    "doc_vectors = docVecModel.dv if hasattr(docVecModel, 'dv') else docVecModel.docvecs\n",
    "\n",
    "# Look vectors up by document tag explicitly instead of `docVecModel[i]`.\n",
    "# NOTE(review): in gensim 4.x an integer key on the model appears to resolve\n",
    "# to a word vector when it is a valid word index -- confirm for the installed version.\n",
    "gl = [doc_vectors[i] for i in range(len(lines['gene']))]\n",
    "\n",
    "# Pairwise Pearson correlation between document vectors (one row per document).\n",
    "# Passing `gl` a second time as `y` would stack the data with itself and give a\n",
    "# 2N x 2N matrix of four identical N x N blocks, quadrupling memory use.\n",
    "GSSM = np.corrcoef(gl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the matrix as text with three decimals and the default delimiter.\n",
    "np.savetxt(fname='GSSM.txt', X=GSSM, fmt='%.3f')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
